
# R code for statistical analyses in:
# Roulette et al. 2014. Tobacco use vs. helminths in Congo basin hunter-gatherers: self-medication in humans?
# doi:10.1016/j.evolhumbehav.2014.05.005

# Note: Some estimates in the publication, which were obtained with older versions of R and supporting 
# packages (e.g. mgcv), are slightly different than those obtained with more recent versions.

library(car)
library(ggplot2)
library(lattice)
library(mgcv)
library(coin)
library(effects)
library(reshape2)
library(stargazer)
library(pROC)

### Adjust width for text wrapping in RStudio notebook ###

options(width=100)

### Read data ###

samples = read.csv('samples.csv', as.is='sample_date')
interview = read.csv('interview.csv')
cyp = read.csv('cyp.csv', as.is=c('cyp2a6.1', 'cyp2a6.2'))

interview$village = factor(interview$region == 1) # Explained in paper

# Although, in the interview data frame, females are assigned to 'treatment' and 'control' groups,
# this is an artifact: after collecting s3 from females, we gave them a packet with a
# code number (which we recorded), but we instructed them to take both pills immediately.
# Then, when we broke the seal on the code list and assigned treatment and control based
# on packet code #, females were 'assigned' to one condition.

interview$intervention[interview$sex=='f'] = NA

### Calculate worm burden values ###

# Non-integer values due to volume correction on raw data

samples$direct_worm = samples$direct_ascaris + samples$direct_ancylo + samples$direct_trich
samples$kato_worm = samples$kato_ascaris + samples$kato_ancylo + samples$kato_trich
samples$mif_worm = samples$mif_ascaris + samples$mif_trich + samples$mif_ancylo

# mif is most sensitive technique, so weight it double
samples$worm = samples$direct_worm + samples$kato_worm + 2 * samples$mif_worm

samples$ascaris = samples$direct_ascaris + samples$kato_ascaris + samples$mif_ascaris
samples$ancylo = samples$direct_ancylo + samples$kato_ancylo + samples$mif_ancylo
samples$trich = samples$direct_trich + samples$kato_trich + samples$mif_trich

### Calculate wealth and acculturation ###

interview$wealth = interview$radio + interview$watch + interview$flashlight + interview$clothes

# Church item is missing 8 responses more than the other acculturation items
# Set those to "n" (or "y") so we can still use most of the church data

interview$church[is.na(interview$church)] = 'n'

interview$acculturation = (as.numeric(interview$forest_village) - 1) +
                          (as.numeric(interview$school_mona) - 1) + 
                          (as.numeric(interview$school_bokala_ngondoa) - 1) + 
                          (as.numeric(interview$church) - 1)

# Number of men and women in study
table(interview$sex)

#####################
### Summary stats ###
#####################

## Study attrition (males)

# Note slightly higher numbers for s2. The reason is:
# A few men returned from the forest as we were collecting
# s2, and wanted to join the study, so we just assigned those samples to s2, even though
# they were really s1 for those guys. Those guys just provided two samples, which we labeled s2 and s3.

male_ids = interview$id[interview$sex=='m']
t = table(samples$sample_num[samples$id %in% male_ids])
print(t)/length(male_ids)

# Table of summary stats for interview data (males)
stargazer(interview[interview$sex=='m', -c(1, 18)], type='text', summary=T) # Remove id and worm_pill from summary

###################################
### Helminth infection patterns ###
###################################

pre_samples = samples[samples$pre_post=='pretreatment',]

helminths = aggregate(cbind(ascaris, ancylo, trich, worm) ~ id, FUN=mean, data=pre_samples)
helminths = merge(helminths, interview[,c('id', 'sex')], by='id')

n = nrow(helminths)

# Fraction infected with Ascaris
sum(helminths$ascaris>0)/n

# Fraction infected with Hookworm
sum(helminths$ancylo>0)/n

# Fraction infected with T. trichiura
sum(helminths$trich>0)/n

# Polyparasitism (fraction of individuals infected with 0, 1, 2, or 3 species)
helminths$polyparasitism = (helminths$ancylo>0) + (helminths$trich>0) + (helminths$ascaris>0)
table(helminths$polyparasitism)/n

# Testing for sig. sex difference in uninfected status
# Continuity correction yields marginal diff. Monte Carlo yields p < 0.05.
chisq.test(helminths$polyparasitism==0, helminths$sex)
chisq.test(helminths$polyparasitism==0, helminths$sex, simulate.p.value=T)

# Testing for sig. sex differences in worm burden scores
# Marginally sig. sex difference for ascaris by permutation test
oneway_test(ascaris ~ sex, data=helminths)
oneway_test(ancylo ~ sex, data=helminths)
oneway_test(trich ~ sex, data=helminths)
oneway_test(worm ~ sex, data=helminths)

ggplot(helminths,aes(x = worm)) + stat_ecdf(aes(colour = sex))

# Hints of a 'wormy' subpopulation, consistent with other studies
qplot(sex, worm, geom='violin', data=helminths) + geom_dotplot(binaxis="y", stackdir='center', binwidth=1.5)

########################
### Smoking patterns ###
########################

# Self reported smoking by sex

# Frequencies
t = table(interview$sex, interview$smoker)
t

# Proportions
prop.table(t, 1) 

# Cotinine smoking by sex

mean_cot = aggregate(cotinine ~ id, FUN=mean, data=pre_samples)
mean_cot = merge(mean_cot, interview[, c('id', 'sex')])

# Frequencies
t = table(mean_cot$sex, mean_cot$cotinine>5) # smoking threshold = 5 ng/ml
t

# Proportions
prop.table(t, 1)

##############################
### Descriptive statistics ###
##############################

## Aggregate baseline values and merge with interview data

mean_cot = aggregate(cotinine ~ id, FUN=mean, data = pre_samples)

# sum worm burden because it is basically egg counts
sum_worm = aggregate(worm ~ id, FUN=sum, data = pre_samples)

# Use number of samples as exposure (offset) variable
stool_count = aggregate(worm ~ id, FUN=function(x) sum(!is.na(x)), pre_samples)
names(stool_count) = c('id', 'stool_count')

baseline.data = merge(mean_cot, sum_worm, by='id')
baseline.data = merge(baseline.data, stool_count, by='id')
baseline.data$worm.mean = baseline.data$worm/baseline.data$stool_count

# The interview variables we want
cols = c('id', 'sex', 'age', 'smoker', 'village', 'wealth', 'acculturation', 'worm_pill')
baseline.data = merge(baseline.data, interview[,cols], by='id', all.y=T)

# ROC analysis (determine cotinine threshold for nonsmokers vs. smokers)
baseline.data$true.smoker = rep(NA, nrow(baseline.data))
baseline.data$true.smoker[baseline.data$smoker=='n' & baseline.data$sex=='f' & baseline.data$cotinine<50] = F
baseline.data$true.smoker[baseline.data$smoker=='y' & baseline.data$sex=='m'] = T

qplot(cotinine, fill=true.smoker, position='dodge', geom='histogram', data=baseline.data) +
  geom_vline(xintercept=5) +
  scale_x_log10(breaks=c(.1, 1, 5, 10, 50, 100, 500)) +
  labs(x='\nCotinine (ng/ml)')

m = roc(true.smoker ~ cotinine, data=baseline.data)
plot(m, print.thres='best')
coords(m, 5)

# Print descriptive stats tables for males and females
cols2 = c('age', 'cotinine', 'worm.mean', 'wealth', 'acculturation')

mf = list(baseline.data[baseline.data$sex=='m',cols2],
          baseline.data[baseline.data$sex=='f',cols2])

stargazer(mf, type='text', digits=1, median=T, title=c('Males', 'Females'))

####################################################################
### Cross-sectional relationship between smoking and worm burden ###
####################################################################

# Men only
burden = baseline.data[baseline.data$sex=='m',]

burden$baseline_cotinine = sqrt(burden$cotinine) # cotinine is highly skewed

number_samples = length(burden$id[!is.na(burden$cotinine) & !is.na(burden$worm)])
number_interviews = length(burden$id[!is.na(burden$wealth)])
clinic_visitors = sum(burden$worm_pill==T, na.rm=T)

# Notes: there are NA's. Also: 7 folks who were treated for worms at clinic not eligible.
# Missing interview data: some interview data was collected when collecting sample 3.
# Some folks provided s1 and s2, but didn't show up for s3. Hence, we have cotinine
# and worm values for them, but no wealth or acculturation values.

# There is one participant, id=76, with everything except an NA for worm_pill.
# Rather than delete, just change to FALSE so that they are included.
# This is justified since worm_pill = TRUE is rare.

burden$worm_pill[burden$id==76] = FALSE

# Compare worm burden of those who took worm pills @ clinic with those who did not
wilcox.test(worm.mean ~ worm_pill, data=burden)

# GAM of worm burden vs cotinine, with controls
# Using negative binomical because worm burden is v. close to egg count (and overdispersed)

m_burden = gam(worm ~ village + acculturation + wealth + s(age) + s(cotinine) + offset(log(stool_count)), family=negbin(c(1,10)), data=burden[burden$worm_pill==F,])
summary(m_burden)
gam.check(m_burden)

# Plot for pub #
par(mfrow=c(1,2), oma=c(1.5,1.5,0,0) + 0.1, mar = c(4,4,0,0) + 0.1)
plot(m_burden, residuals=T, jit=T, pch=20, select=1, xlab='Age (years)', ylab='Worm burden')
text(65, 4.5, 'A', cex=2)
plot(m_burden, residuals=T, jit=T, pch=20, select=2, xlab='Cotinine (ng/ml)', ylab='', yaxt='n')
text(650, 4.5, 'B', cex=2)

#################################################################
### Effect of albendazole treatment on cotinine concentration ###
#################################################################

# # Alternative aggregating code; variable names slightly different
# f = melt(samples, id.vars=c('id', 'pre_post'), measure.vars=c('cotinine', 'worm'))
# g = dcast(f, id ~ pre_post + variable, fun.aggregate=function(x) mean(x, na.rm=T))
# cols = c('id', 'intervention')
# g = merge(g, interview[interview$sex=='m', cols], by='id')

# id 473 is missing post-treatment worm score (which is not required for RCT)
# Average cotinine and worm burden for pre- and post-treatment samples
f = aggregate(cbind(cotinine, worm) ~ id + pre_post, mean, na.rm=T, na.action=na.pass, data=samples)

# Convert to wide format
g = reshape(f, idvar='id', timevar='pre_post', v.names=c('cotinine', 'worm'), direction='wide')

# Include treatment vs control condition
cols = c('id', 'intervention', 'age')
g = merge(g, interview[interview$sex=='m', cols], by='id')

# Eliminate non-RCT participants
g = g[!is.na(g$intervention),]

# Square-root transform cotinine (because skewed), and center variables
g$baseline_cotinine = sqrt(g$cotinine.pretreatment) # cotinine is skewed
g$baseline_worm = g$worm.pretreatment - mean(g$worm.pretreatment, na.rm=T)
g$follow_up_cotinine = sqrt(g$cotinine.posttreatment) # cotinine is skewed

# id 196 is missing baseline cotinine and worm but has follow-up cotinine and worm
g = g[g$id != 196,]

# RCT randomization and attrition

pre_treat    = g$intervention=='treatment' & (!is.na(g$cotinine.pretreatment)  | !is.na(g$worm.pretreatment))
pre_control  = g$intervention=='control'   & (!is.na(g$cotinine.pretreatment)  | !is.na(g$worm.pretreatment))

post_treat   = pre_treat   & !is.na(g$cotinine.posttreatment)
post_control = pre_control & !is.na(g$cotinine.posttreatment)

lost_treat   = pre_treat   & is.na(g$cotinine.posttreatment)
lost_control = pre_control & is.na(g$cotinine.posttreatment)

## Testing randomization

# Some ECDF plots showing baseline values for treatment vs. control groups

ggplot(g,aes(x = baseline_cotinine)) + stat_ecdf(aes(colour = intervention))
ggplot(g,aes(x = baseline_worm)) + stat_ecdf(aes(colour = intervention))
ggplot(g,aes(x = age)) + stat_ecdf(aes(colour = intervention))

# tests

ks.test(g$baseline_cotinine[pre_treat], g$baseline_cotinine[pre_control])
ks.test(g$baseline_worm[pre_treat], g$baseline_worm[pre_control])
ks.test(g$age[pre_treat], g$age[pre_control])

wilcox.test(g$baseline_cotinine[pre_treat], g$baseline_cotinine[pre_control])
wilcox.test(g$baseline_worm[pre_treat], g$baseline_worm[pre_control])
wilcox.test(g$age[pre_treat], g$age[pre_control])

oneway_test(baseline_cotinine ~ intervention, data=g)
oneway_test(baseline_worm ~ intervention, data=g)
oneway_test(age ~ intervention, data=g)

## Testing for biased attrition

g$lost = factor(is.na(g$cotinine.posttreatment))

m.age  = lm(age ~ intervention*lost, data=g)
m.cot  = lm(baseline_cotinine ~ intervention*lost, data=g)
m.worm = lm(worm.pretreatment ~ intervention*lost, data=g)

# Some biased attrition, but no significant interaction with intervention

drop1(m.age, test='F')
drop1(m.cot, test='F')
drop1(m.worm, test='F')

plot(allEffects(m.age))
plot(allEffects(m.cot))
plot(allEffects(m.worm))

# Attrition table

cols = c('age', 'cotinine.pretreatment', 'worm.pretreatment')

stargazer(
  g[pre_treat, cols],
  g[pre_control, cols], 
  summary=T, 
  summary.stat=c('n', 'mean', 'sd'),
  digits = 1,
  title=c('Treatment allocation', 'Control allocation'), 
  type='text'
)

stargazer(
  g[lost_treat, cols],
  g[lost_control, cols], 
  summary=T, 
  summary.stat=c('n', 'mean', 'sd'), 
  digits = 1,
  title=c('Treatment lost to follow-up', 'Control lost to follow-up'), 
  type='text'
)

stargazer(
  g[post_treat, cols],
  g[post_control, cols], 
  summary=T, 
  summary.stat=c('n', 'mean', 'sd'), 
  digits = 1,
  title=c('Remaining in treatment', 'Remaining in control'), 
  type='text'
)

# Effects of 'lost' on age, baseline_cotinine, and worm

t.test(age ~ lost, data=g)
t.test(worm.pretreatment ~ lost, data=g)
t.test(baseline_cotinine ~ lost, data=g)

oneway_test(age ~ lost, data=g)
oneway_test(worm.pretreatment ~ lost, data=g)
oneway_test(baseline_cotinine ~ lost, data=g)


# Some ECDF plots showing baseline values for lost-to-follow-up vs. remaining

ggplot(g,aes(x = baseline_cotinine)) + stat_ecdf(aes(colour = lost))
ggplot(g,aes(x = baseline_worm)) + stat_ecdf(aes(colour = lost))
ggplot(g,aes(x = age)) + stat_ecdf(aes(colour = lost))

# Manipulation check

wilcox.test(g$worm.pretreatment[g$intervention=='treatment'], g$worm.posttreatment[g$intervention=='treatment'], paired=T, alternative='g')
wilcox.test(g$worm.pretreatment[g$intervention=='control'], g$worm.posttreatment[g$intervention=='control'], paired=T)

mean(g$worm.pretreatment[g$intervention=='treatment'], na.rm=T)
mean(g$worm.posttreatment[g$intervention=='treatment'], na.rm=T)

mean(g$worm.pretreatment[g$intervention=='control'], na.rm=T)
mean(g$worm.posttreatment[g$intervention=='control'], na.rm=T)

# Plot worm distribution in treatment vs. control groups s1-s6 (manipulation after s3)
samples2 = merge(samples[,c('id', 'sample_num', 'worm')], interview[,c('id', 'sex', 'intervention')])
samples2 = samples2[samples2$sex=='m' & samples2$sample_num<7 & !is.na(samples2$intervention),]
p = ggplot(samples2, aes(x=factor(sample_num), y=worm))
p = p + facet_grid(intervention~.)
p = p + geom_violin(scale='width', trim=T)
p = p + geom_dotplot(binaxis="y", stackdir='center', binwidth=1.5)
p = p + stat_summary(fun.y = median, geom="point", color='red', size=2, mapping=aes(sample_num, worm))
p = p + geom_vline(xintercept = 3.5, linetype='longdash')
p = p + xlab("\nSample number")
p = p + ylab("Worm burden score\n")
p = p + theme_grey(20)
p

# Fit model
m = lm(follow_up_cotinine ~ baseline_cotinine + baseline_worm * intervention, data = g)
summary(m)
plot(m) # Residual and Q-Q plots not too horrible

# Plot model

# Create data for predicted results, and plot

mean_cot = mean(g$baseline_cotinine, na.rm=T)^2
tmp = data.frame(cotinine=c(mean_cot,mean_cot,mean_cot,75), worm=c(-20,60,-20,60), intervention=c('control', 'control', 'treatment', 'treatment'))
p = xyplot(cotinine~worm|intervention, type='l', xlab='', ylab=' ', ylim=c(60,140), data=tmp)

# Plot the ANCOVA

sqr = function(x){x^2} # Needed to rescale y-axis
e = effect('baseline_worm:intervention', m, transformation=list(link=sqrt, inverse=sqr))
summary(e)

p2 = plot(e, xlab='Baseline worm burden (centered)', ylab='Follow-up cotinine concentration (ng/ml)', main='')

print(p, pos=c(0,.65,1,1), more=T)
print(p2, pos=c(0,0,1,.7))

################################
### Cotinine vs. reinfection ###
################################

# Alternative aggregating code

# f2 = melt(samples, id.vars=c('id', 'pre_post'), measure.vars=c('cotinine', 'worm'))
# g2 = dcast(f2, id ~ variable + pre_post, fun.aggregate=function(x) mean(x, na.rm=T))
# cols = c('id', 'sex', 'age', 'intervention')
# g2 = merge(g2, interview[interview$sex=='m', cols], by='id')
# g2$delta_worm = g2$worm_year - g2$worm_posttreatment
# # Year 1 cotinine vs. reinfection
# cor.test(g2$cotinine_posttreatment[g2$intervention=='treatment'], g2$delta_worm[g$intervention=='treatment'], method='s', alternative='l')
# # Year 2 cotinine vs. reinfection
# cor.test(g2$cotinine_year[g2$intervention=='treatment'], g2$delta_worm[g$intervention=='treatment'], method='s', alternative='l')

# Code for published article

f_cot  = aggregate(cotinine ~ id + pre_post, FUN=function(x) mean(x, na.rm=T), data = samples) # average cotinine values, pre and post treatment
f_worm = aggregate(worm ~ id + pre_post, FUN=function(x) mean(x, na.rm=T), data = samples) # average worm burden, pre and post treatment

f = merge(f_worm, f_cot, all=T)

g = reshape(f, idvar='id', timevar='pre_post', v.names=c('cotinine', 'worm'), direction='wide')

cols = c('id', 'sex', 'age', 'intervention')

# Men only

g = merge(g, interview[interview$sex=='m', cols], by='id')

g$delta_worm = g$worm.year - g$worm.posttreatment

# Didn't measure cotinine after pill #2, so can only use treatment group

# Year 1 cotinine vs. reinfection
cor.test(g$cotinine.posttreatment[g$intervention=='treatment'], g$delta_worm[g$intervention=='treatment'], method='s', alternative='l')

# Year 2 cotinine vs. reinfection
cor.test(g$cotinine.year[g$intervention=='treatment'], g$delta_worm[g$intervention=='treatment'], method='s', alternative='l')

# Scatterplot and regression line

p = qplot(cotinine.posttreatment, delta_worm, geom=c('point', 'smooth'), method=lm, data=g[g$intervention=='treatment',])
p = p + labs(x='\nYear 1 cotinine concentration (ng/ml)', y='Reinfection score\n')
print(p)

############################################
### CYP2A6 polymorphisms vs. worm burden ###
############################################

slow1 = cyp$cyp2a6.1 %in% c('17', '9')
normal1 = cyp$cyp2a6.1 == '1A'
fast1 = cyp$cyp2a6.1 == '1B'

slow2 = cyp$cyp2a6.2 %in% c('17', '9')
normal2 = cyp$cyp2a6.2 == '1A'
fast2 = cyp$cyp2a6.2 == '1B'

cyp$phenotype = rep(NA, nrow(cyp))

cyp$phenotype[slow1] = -1
cyp$phenotype[normal1] = 0
cyp$phenotype[fast1] = 1

cyp$phenotype[slow2] = cyp$phenotype[slow2] - 1
cyp$phenotype[fast2] = cyp$phenotype[fast2] + 1

cyp$phenotype2 = factor(cyp$phenotype>=0, levels=c(F, T), labels=c('Slow', 'Extensive'))

# Genotypes
a = addmargins(table(cyp$cyp2a6.1, cyp$cyp2a6.2))
print(a)

# Allele frequencies
a['Sum',] + a[,'Sum']

# Using negative binomial so sum worm burden as a proxy for egg counts
# Then use "stool count" as an exposure variable

f_cot      = aggregate(cotinine ~ id + pre_post, FUN=mean, data = samples)
f_worm     = aggregate(worm ~ id + pre_post, FUN=sum, data = samples)
f_exposure = aggregate(worm ~ id + pre_post, FUN= function(x) sum(!is.na(x)), data = samples)
names(f_exposure) = c('id', 'pre_post', 'stool_count')

f = merge(f_worm, f_cot, all=T)
f = merge(f_exposure, f, all=T)

g = reshape(f, idvar='id', timevar='pre_post', v.names=c('cotinine', 'worm', 'stool_count'), direction='wide')
cols = c('id', 'sex', 'age', 'intervention')

# Men only

g = merge(g, interview[interview$sex=='m', cols], by='id')

g = merge(g, cyp, by='id')
g$phenotype = ordered(g$phenotype)
g$phenotype2 = factor(g$phenotype>=0)
levels(g$phenotype2) = c('Slow', 'Extensive')
g$age_c = g$age - mean(g$age, na.rm=T)
g$baseline_cotinine = g$cotinine.pretreatment - mean(g$cotinine.pretreatment, na.rm=T)

m_cyp2a6 = gam(worm.pretreatment ~ phenotype2 + s(baseline_cotinine) + s(age_c, by=phenotype2) + offset(log(stool_count.pretreatment)), family=negbin(c(1,10)), data=g)
summary(m_cyp2a6)
gam.check(m_cyp2a6)

# Don't plot baseline cotinine vs. worm, since this plot is very similar to earlier plot
par(mfrow=c(2,1), mar=c(4,4,0,0))
plot(m_cyp2a6, select=3, xlab='', xaxt='n', ylab='Worm burden score')
plot(m_cyp2a6, select=2, jit=T, xlab='Centered age (years)', ylab='Worm burden score')


